********************************************************************************
*                                Set Directory                                 *
********************************************************************************
clear all

* Root data folder; every path below is built from ${user}, and relative
* file names (spadtemp, basictemp, ...) resolve against it after the cd.
global user "~\SHEPad\data"
cd "${user}"

********************************************************************************
*                        Clean SHE Pad Treatment Data                          *
********************************************************************************

* import raw data
import delimited "${user}\raw_shepad\Data on sanitary pad installation by year.csv", clear

* drop unnecessary variables and observations without school names
drop if missing(nameofschool)
drop v2*

* extract schoolid and district
* (schoolid = first 5 characters of nameofschool; districtname = district from character 5 on, upper-cased)
gen schoolid=stritrim(strtrim(substr(nameofschool,1,5)))
gen districtname=upper(stritrim(strtrim(substr(district,5,.))))

* extract schoolname (remove district name at the end of the string)
gen schoolname=stritrim(strtrim(substr(nameofschool,8,.)))
replace schoolname=stritrim(strtrim(subinstr(schoolname,".","",.)))

* locate every comma in the name; moss (user-written) returns _count and _pos1, _pos2, ...
moss schoolname, match(",")

* last_pos = position of the LAST comma in each name.
* The loop runs up to the largest comma count actually present instead of a
* fixed 2..4, so names with 5+ commas are handled and no _pos# variable is
* referenced unless some observation has that many commas.
gen last_pos=.
quietly summarize _count, meanonly
local maxcommas = cond(r(N)>0, r(max), 0)
forvalues k=1/`maxcommas' {
    replace last_pos=_pos`k' if _count==`k'
}

* text after the last comma; strip it from the name only when it equals the district
gen district_dum=stritrim(strtrim(substr(schoolname,last_pos+1,.)))
replace schoolname=stritrim(strtrim(substr(schoolname,1,last_pos-1))) if district_dum==districtname
drop district_dum *_pos* _count

* gen unique school id
egen id2=group(schoolname districtname)

* save data
save "${user}\spadtemp.dta", replace

********************************************************************************
*                    Match SHE Pad Treatment Data to DISE                      *
********************************************************************************

* District prefixes: a school belongs to prefix `d' when its lower-cased
* district name starts with `d'.
local districts ala ern idu kan kas kol kot koz mal pal pat thi thr way

* separate treatment data into one "using" file per district
use spadtemp, clear
gen dn = strlower(districtname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep id2 schoolname schoolid
    rename schoolname txt2
    save usingfile_`d', replace
    restore
}
drop dn

* clean school names in DISE file for match later
* (keep, per school_code, the row that sorts last on ac_year)
import delimited "${user}\raw_dise\basic.csv", clear
bysort school_code (ac_year): gen schooldup=_n
egen maxschooldup=max(schooldup), by(school_code)
keep if schooldup==maxschooldup
gen schoolname=strtrim(stritrim(school_name))
replace schoolname=stritrim(strtrim(subinstr(schoolname,".","",.)))
egen id1=group(schoolname distname)
save basictemp, replace

* separate DISE schools into one "master" file per district
use basictemp, clear
gen dn = strlower(distname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep school_code schoolname id1
    rename schoolname txt1
    save masterfile_`d', replace
    restore
}
drop dn

* 1st round of matchit: fuzzy-match school names within each district (3-gram similarity)
foreach d of local districts {
    use masterfile_`d', clear
    matchit id1 txt1 using usingfile_`d'.dta, sim(ngram,3) idusing(id2) txtusing(txt2) generate(similarity_score)
    save similarity_`d', replace
}

* append matched files (first district opens the stack, the rest are appended in list order)
gettoken first rest : districts
use similarity_`first', clear
foreach d of local rest {
    append using similarity_`d'
}

* manually check matched pairs and sort out duplicated matched pairs, save as match_check
//save match_check, replace

* 2nd round of matchit using unmatched treatment and dise data
local districts ala ern idu kan kas kol kot koz mal pal pat thi thr way

* treatment schools absent from match_check (keep(1) = master-only rows), split by district
use spadtemp, clear
merge m:1 id2 using match_check, keep(1) nogen
gen dn = strlower(districtname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep id2 schoolname schoolid
    rename schoolname txt2
    save usingfile_`d'2, replace
    restore
}

* DISE schools absent from match_check, split by district
use basictemp, clear
merge m:1 id1 using match_check, keep(1) nogen
gen dn = strlower(distname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep school_code schoolname id1
    rename schoolname txt1
    save masterfile_`d'2, replace
    restore
}

* matchit within district, this time with threshold(.15)
foreach d of local districts {
    use masterfile_`d'2, clear
    matchit id1 txt1 using usingfile_`d'2.dta, sim(ngram,3) idusing(id2) txtusing(txt2) generate(similarity_score) threshold(.15)
    save similarity_`d'2, replace
}

* append matched files
gettoken first rest : districts
use similarity_`first'2, clear
foreach d of local rest {
    append using similarity_`d'2
}

* manually check matched pairs and sort out duplicated matched pairs, save as match_check2
//save match_check2, replace

* 3rd round of matchit using unmatched treatment and dise data
local districts ala ern idu kan kas kol kot koz mal pal pat thi thr way

* treatment schools still without a similarity_score after rounds 1 and 2, split by district
use spadtemp, clear
merge m:1 id2 using match_check, nogen
merge m:1 id2 using match_check2, update nogen
keep if missing(similarity_score)
gen dn = strlower(districtname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep id2 schoolname schoolid
    rename schoolname txt2
    save usingfile_`d'3, replace
    restore
}

* DISE schools still without a similarity_score after rounds 1 and 2, split by district
use basictemp, clear
merge m:1 id1 using match_check, nogen
merge m:1 id1 using match_check2, update nogen
keep if missing(similarity_score)
gen dn = strlower(distname)
foreach d of local districts {
    preserve
    keep if strpos(dn,"`d'")==1
    keep school_code schoolname id1
    rename schoolname txt1
    save masterfile_`d'3, replace
    restore
}

* matchit within district with threshold(.05)
foreach d of local districts {
    use masterfile_`d'3, clear
    matchit id1 txt1 using usingfile_`d'3.dta, sim(ngram,3) idusing(id2) txtusing(txt2) generate(similarity_score) threshold(.05)
    save similarity_`d'3, replace
}

* append matched files
gettoken first rest : districts
use similarity_`first'3, clear
foreach d of local rest {
    append using similarity_`d'3
}

* manually check matched pairs and sort out duplicated matched pairs, save as match_check3
//save match_check3, replace

* merge all 3 rounds of matched pairs to treatment data
* Each merge's indicator is kept as merge1..merge3. _merge is spelled out in
* full: the abbreviation _m is ambiguous whenever another variable starting
* with _m (e.g. the _match# variables left behind by moss) is in memory.
use spadtemp, clear
merge m:1 id2 using match_check
rename _merge merge1

* update: later rounds only fill values that are still missing
merge m:1 id2 using match_check2, update
rename _merge merge2

merge m:1 id2 using match_check3, update
rename _merge merge3

* merge dise school id
* (one row per id1 from basictemp, so the m:1 merge key is unique in the using file)
preserve
use basictemp, clear
duplicates drop id1, force
tempfile basictemp1
save `basictemp1', replace
restore
* all options sit behind a single comma
merge m:1 id1 using `basictemp1', keepusing(school_code) keep(1 3) nogen

* rename variables
rename txt1 basic_schoolname
rename txt2 spad_schoolname
rename id1 basic_schoolid
rename id2 spad_schoolid
rename school_code schcd
format schcd %20.0g

* string copy of the numeric code (school_code), then drop observations without school code
tostring schcd, gen(school_code) force usedisplayformat
drop if missing(schcd)

* save data
save "${user}\sanitarypadinstallationdata.dta", replace

* manually check sanitarypadinstallationdata and identify wrong matched pairs (very similar names but different schools), save as wrong.dta
// save "${user}\wrong.dta", replace

* erase unnecessary data
* Removes the per-district scratch files of all three matching rounds
* (round 1 has no suffix, rounds 2 and 3 carry the suffix 2 / 3).
* Each file is erased exactly once: erasing the same file again stops the
* do-file with "file not found". capture keeps the cleanup best-effort when a
* file is already gone (e.g. on a partial re-run).
local districts ala ern idu kan kas kol kot koz mal pal pat thi thr way
foreach v in masterfile usingfile similarity {
    foreach d of local districts {
        capture erase `v'_`d'.dta
        forvalues i=2/3 {
            capture erase `v'_`d'`i'.dta
        }
    }
}

********************************************************************************
*                Append and Combine Raw DISE Data Files                        *
********************************************************************************

* Scratch files holding the five school-year level DISE tables; each one gets
* schcd shown with all digits and a string copy of the code in school_code.
tempfile basic facility general teachers rte

* School Address Info - basic.csv 2013-2017 (the code arrives as school_code here)
import delimited "${user}\raw_dise\basic.csv", clear
drop v1
rename school_code schcd
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
save `basic', replace

* School Infrastructure Info - facility.csv 2013-2017
import delimited "${user}\raw_dise\facility.csv", clear
drop v1
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
save `facility', replace

* School Administration Info - general.csv 2013-2017
import delimited "${user}\raw_dise\general.csv", clear
drop v1
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
save `general', replace

* School Teacher Info - teachers.csv 2013-2017
import delimited "${user}\raw_dise\teachers.csv", clear
drop v1
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
save `teachers', replace

* School Additional Info - rte.csv 2013-2017
import delimited "${user}\raw_dise\rte.csv", clear
drop v1
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
save `rte', replace

* Scratch files for every enrolment source that is stacked into enrolment.dta.
* Each source is tagged with a caste label and its boy/girl columns are
* renamed to the common *_b / *_g suffixes before stacking.
tempfile obc_enrolment sc_enrolment st_enrolment disabled_enrolment total_enrolment
tempfile repeaters enrolment1819 enrolment1920

* School OBC Enrollment - obc_enrolment.csv 2013-2017
import delimited "${user}\raw_dise\obc_enrolment.csv", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
gen caste="OBC"
rename *_ob *_b
rename *_og *_g
save `obc_enrolment', replace

* School SC Enrollment - sc_enrolment.csv 2013-2017
import delimited "${user}\raw_dise\sc_enrolment.csv", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
gen caste="SC"
rename *_cb *_b
rename *_cg *_g
save `sc_enrolment', replace

* School ST Enrollment - st_enrolment.csv 2013-2017
import delimited "${user}\raw_dise\st_enrolment.csv", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
gen caste="ST"
rename *_tb *_b
rename *_tg *_g
save `st_enrolment', replace

* School Disabled Enrollment - disabled_enrolment.csv 2013-2017
import delimited "${user}\raw_dise\disabled_enrolment.csv", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
gen caste="DISABLED"
rename *_dis_* *_*
save `disabled_enrolment', replace

* School Total Enrollment - total_enrolment.csv 2013-2017
import delimited "${user}\raw_dise\total_enrolment.csv", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
gen caste="TOTAL"
rename *_totb *_b
rename *_totg *_g
save `total_enrolment', replace

* School Repeaters Enrollment - repeaters.csv 2013-2019
* (2018 and 2019 come from separate .dta files appended before the string code is built)
import delimited "${user}\raw_dise\repeaters.csv", clear
format schcd %20.0g
append using "${user}\raw_dise\repeater_2018.dta"
append using "${user}\raw_dise\repeater_2019.dta"
tostring schcd, generate(school_code) force usedisplayformat
gen caste="REPEATER"
save `repeaters', replace

* 18-20 enrollment data: already carries a caste variable, only upper-cased here
use "${user}\raw_dise\enrolment_1819.dta", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
replace caste=upper(caste)
save `enrolment1819', replace

use "${user}\raw_dise\enrolment_1920.dta", clear
format schcd %20.0g
tostring schcd, generate(school_code) force usedisplayformat
replace caste=upper(caste)
drop sch_name sch_mgmt sch_cat block_code cluster_code village_code school_type
save `enrolment1920', replace

* append and combine enrollment data (TOTAL first, then the remaining parts in this order)
use `total_enrolment', clear
foreach part in disabled_enrolment obc_enrolment sc_enrolment st_enrolment repeaters enrolment1819 enrolment1920 {
    append using ``part''
}

* save enrollment data
save "${user}\enrolment.dta", replace

* merge school-year level data
use `basic', clear
merge m:1 school_code ac_year using `general', nogen
merge m:1 school_code ac_year using `facility', nogen
merge m:1 school_code ac_year using `teachers', nogen
merge m:1 school_code ac_year using `rte', nogen

* append 18-20 school data
append using "${user}\raw_dise\school_1819.dta"
append using "${user}\raw_dise\school_1920.dta"

* fill missing data
* distname is a string, so blanks sort FIRST. Copying from the next row
* ([_n+1]) only ever filled the last blank row of a school, because the rows
* above it still saw a blank neighbour. Sorting blanks last and copying the
* first row of the school fills every blank row with the school's
* alphabetically first non-blank district name (the same value the [_n+1]
* version gave to the one row it did fill).
gen byte dist_blank=missing(distname)
bysort school_code (dist_blank distname): replace distname=distname[1] if dist_blank
drop dist_blank
* pincode / rururb: missing values sort last, so copying the previous row
* cascades down through all missing rows.
* NOTE(review): this relies on pincode being numeric - confirm.
bysort school_code (pincode): replace pincode=pincode[_n-1] if missing(pincode)
bysort school_code (rururb): replace rururb=rururb[_n-1] if missing(rururb)

* merge treatment data
merge m:1 school_code using "${user}\sanitarypadinstallationdata.dta", nogen

* merge wrongly matched treatment data (brings in the manual flag variable wrong)
merge m:1 school_code using "${user}\wrong.dta", keepusing(wrong) nogen

* save school facility/teacher information data
save "${user}\school.dta", replace

********************************************************************************
*                             Clean Attendance Data                            *
********************************************************************************

* import treated
* Read the wide attendance sheet once and cache it; each year's columns are
* then cut out of the cached copy.
import delimited "${user}\raw_dise\treated_attendance.csv", clear
format schcd %20.0g
tempfile attendance_wide
save `attendance_wide', replace

forvalues y=2013/2019 {
    use `attendance_wide', clear
    keep schcd school_name t_machine_installed t_machine_year year`y'grade*
    * year<y>grade<l>boys/girls -> a<l>b / a<l>g
    forvalues l=5/10 {
        rename year`y'grade`l'boys a`l'b
        rename year`y'grade`l'girls a`l'g
    }
    gen year=`y'
    tempfile year`y'
    save `year`y'', replace
}

* stack the seven yearly slices
use `year2013', clear
forvalues y=2014/2019 {
    append using `year`y''
}

* reshape from wide to long (one row per school-year-gender; 1 = boys "b", 2 = girls "g")
reshape long a5 a6 a7 a8 a9 a10, i(schcd school_name t_machine_installed t_machine_year year) j(genders) string
gen gender=cond(genders=="b", 1, cond(genders=="g", 2, .))
drop genders

save "${user}\tattendance.dta", replace

********************************************************************************
*                      Clean Data for Estimation Purpose                       *
********************************************************************************

* import enrolment data
* (stacked enrolment rows; caste tags the source: TOTAL, OBC, SC, ST, DISABLED, REPEATER)
use "${user}\enrolment.dta", clear

* merge school facility information
merge m:1 school_code ac_year using "${user}\school.dta", nogen

* drop duplicated observations
* (rows identical on every variable positioned from schcd through c12_g, plus caste)
duplicates drop schcd-c12_g caste, force

* drop all school-year with multiple observations
* (any school-year-caste cell that still has more than one row is removed entirely)
bysort schcd ac_year caste: gen dup=_N
keep if dup==1
drop dup

* academic year (take the earlier year, eg 2013-2014 take 2013)
gen year=real(substr(ac_year,1,4))

* rename variable
rename toilet_g toiletg

/*------clean school type (school gender) based on enrolment information------*/
* schtype coding used below: 2 = girls school, 3 = mixed school (see the keep
* statements later in this file); 1 is presumably boys school - TODO confirm.

* replace schtype as missing if invalid data entry
replace schtype=. if schtype==0 | schtype==9

* number of boys and girls at school-year level
* total_`s' = row sum over classes 0-12; sum_`s' = that total summed over ALL
* rows of the school (every year and every caste row still in memory)
foreach s in g b {
egen total_`s'=rowtotal(c0_`s' c1_`s' c2_`s' c3_`s' c4_`s' c5_`s' c6_`s' c7_`s' c8_`s' c9_`s' c10_`s' c11_`s' c12_`s' )
bysort schcd (year): egen sum_`s'=sum(total_`s')
}

* drop schools with 0 enrollment for both girls and boys
drop if sum_g==0 & sum_b==0

* gen boy to girl ratio and girl to boy ratio
* (when the denominator is 0 it is replaced by 1 so the ratio is never missing)
gen bgratio=sum_b/sum_g
replace bgratio=sum_b/(sum_g+1) if sum_g==0
gen gbratio=sum_g/sum_b
replace gbratio=sum_g/(sum_b+1) if sum_b==0

* check if schools have inconsistent school type across years
egen maxtype=max(schtype), by(schcd)
egen mintype=min(schtype), by(schcd)
gen changetype=1 if maxtype!=mintype

* for schools that have inconsistent school type, replace schtype using information on boys and girls enrollment
* The replaces run in order: first try the school's minimum recorded type,
* then its maximum; a type is adopted only when the enrolment ratios agree
* with it (5% cut-off). Later lines see the results of earlier ones.
replace schtype=mintype if schtype!=mintype & changetype==1 & mintype==1 & gbratio<0.05
replace schtype=mintype if schtype!=mintype & changetype==1 & mintype==2 & bgratio<0.05
replace schtype=mintype if schtype!=mintype & changetype==1 & mintype==3 & gbratio>0.05 & bgratio>0.05
replace schtype=maxtype if schtype!=maxtype & changetype==1 & maxtype==1 & gbratio<0.05
replace schtype=maxtype if schtype!=maxtype & changetype==1 & maxtype==2 & bgratio<0.05
replace schtype=maxtype if schtype!=maxtype & changetype==1 & maxtype==3 & gbratio>0.05 & bgratio>0.05
drop changetype maxtype mintype
* fill remaining missing schtype with the school's (recomputed) maximum type
egen maxtype=max(schtype), by(schcd)
replace schtype=maxtype if missing(schtype)
drop maxtype

* drop schools if schtype is inconsistent with number of boys/girls enrollment (e.g., girl schools with schtype=2 recorded large number of boys enrollment in some years) - drop only when bgratio or gbratio is larger than 0.05
* (for schtype==3 the row is dropped when either ratio is BELOW 0.05)
bysort schcd (year): gen todrop=1 if (schtype==1 & gbratio>0.05) | (schtype==2 & bgratio>0.05) | (schtype==3 & (gbratio<0.05 | bgratio<0.05))
drop if todrop==1

* drop unnecessary variables
drop todrop total_g total_b sum_g sum_b bgratio gbratio

/*----------------------------------------------------------------------------*/
/*----clean schools' lowclass and highclass based on enrolment information----*/

* identify and replace lowclass for schools that have lowclass inconsistent with enrolment information
* lag_c<c> = boys + girls enrolled in all classes BELOW c (classes 0..c-1);
* the global classtotal is emptied first and then grows by one class per pass.
global classtotal
forval g=0(1)11 {
local c = `g'+1
global classtotal $classtotal c`g'_g c`g'_b
egen lag_c`c' = rowtotal($classtotal)
}
* a row is inconsistent when pupils are enrolled below its recorded lowclass;
* lowclass is then lowered by one class per pass, for at most 5 passes
forval i = 1(1)5 {
gen inconsistent=1 if (lowclass==1 & lag_c1!=0) | (lowclass==2 & lag_c2!=0) | (lowclass==3 & lag_c3!=0) | (lowclass==4 & lag_c4!=0) | (lowclass==5 & lag_c5!=0) | (lowclass==6 & lag_c6!=0) | (lowclass==7 & lag_c7!=0) | (lowclass==8 & lag_c8!=0) | (lowclass==9 & lag_c9!=0) | (lowclass==10 & lag_c10!=0) | (lowclass==11 & lag_c11!=0) | (lowclass==12 & lag_c12!=0)
replace lowclass=lowclass-1 if inconsistent==1
drop inconsistent
}
drop lag_c*

* identify and replace highclass for schools that have highclass inconsistent with enrolment information
* fw_c<c> = boys + girls enrolled in all classes ABOVE c (classes c+1..12),
* accumulated from class 12 downwards.
global classtotal
forval g=12(-1)1 {
local c = `g'-1
global classtotal $classtotal c`g'_g c`g'_b
egen fw_c`c' = rowtotal($classtotal)
}
* a row is inconsistent when pupils are enrolled above its recorded highclass;
* highclass is then raised by one class per pass, for at most 11 passes
forval i = 1(1)11 {
gen inconsistent=1 if (highclass==0 & fw_c0!=0) | (highclass==1 & fw_c1!=0) | (highclass==2 & fw_c2!=0) | (highclass==3 & fw_c3!=0) | (highclass==4 & fw_c4!=0) | (highclass==5 & fw_c5!=0) | (highclass==6 & fw_c6!=0) | (highclass==7 & fw_c7!=0) | (highclass==8 & fw_c8!=0) | (highclass==9 & fw_c9!=0) | (highclass==10 & fw_c10!=0) | (highclass==11 & fw_c11!=0)
replace highclass=highclass+1 if inconsistent==1
drop inconsistent
}
drop fw_c*

* gen the most frequent lowclass / highclass for each school across years
* lowdup/highdup count, on TOTAL rows only, how many rows of the school share
* the same lowclass/highclass; the value with the highest count is kept, and
* ties between equally frequent values are resolved by taking the larger one.
bysort schcd caste lowclass: gen lowdup=_N if caste=="TOTAL"
bysort schcd caste highclass: gen highdup=_N if caste=="TOTAL"
egen ll=max(lowdup),by(schcd)
egen hh=max(highdup),by(schcd)
gen lll=lowclass if lowdup==ll
gen hhh=highclass if highdup==hh
egen lowclass_all=max(lll), by(schcd)
egen highclass_all=max(hhh), by(schcd)
drop lowdup highdup ll hh lll hhh
/*----------------------------------------------------------------------------*/

* gen dummies for school infrastructure
* Every dummy is 1 when the source variable equals the listed code(s) and 0
* otherwise; a missing source value therefore yields 0, not missing.
foreach v in electric library txtbkrecd ramps medchk cal hmroom smc smcsdp {
    gen `v' = (`v'_yn==1)
}
gen playground = (pground_yn==1)
gen access     = (approachbyroad==1)

* drinking water source
gen water_hp     = (water==1)
gen water_well   = (water==2)
gen water_tap    = (water==3)
gen water_others = (water==4)
gen water_none   = (water==5)

* building status
gen bld_private = (bldstatus==1)
gen bld_rented  = (bldstatus==2)
gen bld_govt    = inlist(bldstatus,3,4)
gen bld_uc      = (bldstatus==7)
gen bld_dilap   = (bldstatus==6)
gen bld_none    = (bldstatus==5)

* boundary wall
gen wall_pucca  = inlist(bndrywall,1,2)
gen wall_wire   = (bndrywall==3)
gen wall_hedges = (bndrywall==4)
gen wall_na     = (bndrywall==0)

* 2019 rows: take management from schmgmt when schmgt is empty
replace schmgt=schmgmt if missing(schmgt) & year==2019 & schmgmt!=.

* rururb code 9 is replaced by the school's smallest recorded rururb value
bysort schcd: egen minrururb=min(rururb)
replace rururb=minrururb if rururb==9
drop minrururb
gen rural = (rururb==1) if !missing(rururb)

* name-based flag: 1 when school_name contains any of the listed substrings
gen christian = strpos(school_name,"S.T")>0 | strpos(school_name,"ST")>0 ///
    | strpos(school_name,"SAINT")>0 | strpos(school_name,"HOLY")>0 ///
    | strpos(school_name,"CHURCH")>0 | strpos(school_name,"SNT")>0
gen public = inlist(schmgt,1,2,3,7)
gen mix    = (schtype==3)

* gen baseline characteristics (year 2013)
* For each listed variable, <v>_2013 holds the school's value on its 2013
* TOTAL row (largest such value), copied to every row of that school.
local basevars schmgt medchk smc smcsdp bookinlib computer electric library ///
    playground ramps cal access water_hp water_well water_tap water_others ///
    water_none bld_private bld_govt bld_uc bld_dilap bld_none wall_pucca ///
    wall_wire wall_hedges wall_na toiletd toiletb toiletg toiletb_func ///
    toiletg_func clrooms clgood clmajor clminor
foreach v of local basevars {
    gen `v'_1313=`v' if year==2013 & caste=="TOTAL"
    egen `v'_2013=max(`v'_1313), by(schcd)
}

* drop unnecessary variables
drop *_1313 senr* pass* appr* p60*

* reshape data from wide to long (each school-year has girls and boys enrollment in separate observation rows)
* suffix _1 = boys, _2 = girls; the suffix becomes the variable gender
gen id=_n
rename *_g *_2
rename *_b *_1
rename fail*g fail*_2
rename fail*b fail*_1
local stubs
forvalues k=0/12 {
    local stubs `stubs' c`k'_
}
forvalues k=0/12 {
    local stubs `stubs' fail`k'_
}
reshape long `stubs', i(id) j(j)
rename j gender
rename c*_ c*
rename fail*_ fail*
drop id

* gen total enrolment by gender (row sum over classes 0-12)
local classes
forvalues k=0/12 {
    local classes `classes' c`k'
}
egen total_enrol=rowtotal(`classes')

* gen total enrolment by school (both genders, within school-caste-year)
egen total_enrolbg=total(total_enrol), by(schcd caste year)

* replace enrolment with missing if data is inconsistent with low/highclass information (replace zero values as missing values)
* (class k is blanked when k lies strictly above both bounds or strictly below both bounds)
forvalues k=0/12 {
    replace c`k'=. if (lowclass<`k' & highclass<`k') | (lowclass>`k' & highclass>`k')
}

* gen treatment variables
* Treatment information is only used for rows with schtype>=2 that are not
* flagged as a wrong match (wrong!=1).
local eligible schtype>=2 & wrong!=1
gen t_machine_installed=whethersanitarynapkinvendingmach=="Yes" & `eligible'
gen t_machine_year=installedyear if !missing(installedyear) & `eligible'
gen t_machine_working=whetherthemachineisinworkingcond=="Yes" if `eligible'
gen t_machine_nostudent=averagenumberofstudentsutilisedt if `eligible'
gen t_machine_agency=nameoftheagencywhichinstalledthe if `eligible'
gen t_distribution=whetherthefacilityfordistributin=="Yes" & `eligible'
gen t_distribution_nostudent=averagenumberofstudentsutilising if `eligible'
gen t_distribution_agency=agencywhichdistributesthenapkins if `eligible'

* gen group ids
* area codes are leading substrings of the string school code:
* first 4 characters = district, first 6 = block, first 9 = village
local len_district 4
local len_block 6
local len_village 9
foreach unit in district block village {
    gen `unit'code=substr(school_code,1,`len_`unit'')
    destring `unit'code, replace
}
egen distyear=group(year districtcode) if !missing(year) & !missing(districtcode)
egen blockyear=group(year blockcode) if !missing(year) & !missing(blockcode)
egen villageyear=group(year villagecode) if !missing(year) & !missing(villagecode)
egen school_gender=group(schcd gender)
egen school_year=group(schcd year)

* generate previous year & previous cohort for the enrolment of across classes
* c<k>_next = enrolment of class k+1 in the following row, used only when that
* row is exactly one year later
forvalues k=1/11 {
    local up = `k' + 1
    bysort schcd caste gender (year): gen c`k'_next = c`up'[_n+1] if year+1==year[_n+1]
}

* gen dependent variable - dropouts
* share of class k not found in class k+1 a year later; defined only when the
* school covers both classes; set to 0 when the two counts are equal
* (this also covers the 0/0 case)
forvalues k=1/10 {
    local up = `k' + 1
    local usable !missing(c`k'_next) & !missing(c`k') & lowclass_all<=`k' & highclass_all>=`up'
    bysort schcd caste gender (year): gen chg_enrolment`k'=(c`k'-c`k'_next)/c`k' if `usable'
    bysort schcd caste gender (year): replace chg_enrolment`k'=0 if c`k'==c`k'_next & `usable'
}

*gen baseline enrolment 2013
foreach v in total_enrolbg total_enrol c7 {
    gen `v'_1313=`v' if year==2013 & caste=="TOTAL"
    egen `v'_2013=max(`v'_1313), by(schcd)
}
drop *_1313

* save data
save "${user}\datause.dta",replace

********************************************************************************
*                     generate data for baseline analysis                      *
********************************************************************************
use "${user}\datause.dta", clear

* keep girls only (girls school and mixed school)
keep if caste=="TOTAL" & gender==2 & inlist(schtype,2,3)

* set balanced panel from 2013-2018
* (a school-gender series is balanced for class k when it has 6 non-missing dropout values)
forvalues k=1/10 {
    bysort school_gender: egen num_chg_enrolment`k' = count(chg_enrolment`k')
    gen balancedpanel`k' = (num_chg_enrolment`k'==6)
}

* generate treatment period (years relative to the installation year)
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
local treatedgirl gender==2 & t_machine_installed==1
gen aft=treatment_period>=0 & `treatedgirl' & !missing(treatment_period)

* event-time dummies: before1..before6 for 1-6 years ahead of installation
forvalues k=1/6 {
    gen before`k'=(treatment_period==-`k' & `treatedgirl')
    label variable before`k' "-`k'"
}
* after0..after4 for the installation year and the 4 years that follow
forvalues k=0/4 {
    gen after`k'=(treatment_period==`k' & `treatedgirl')
    label variable after`k' "`k'"
}

* save data
save "${user}\data.dta",replace

********************************************************************************
*                   Generate enrollments for Lower-Caste Only                  *
********************************************************************************
* Load only the SC / ST / OBC / TOTAL rows. The option for discarding the data
* in memory is clear; use does not accept replace.
use "${user}\datause.dta" if caste=="SC" | caste=="ST" | caste=="OBC" | caste=="TOTAL", clear

* keep school-gender-year cells that have all four caste rows, then drop the TOTAL row
bysort schcd gender year: egen num = count(caste)
keep if num==4
keep if caste=="SC" | caste=="ST" | caste=="OBC"
keep caste gender schcd year lowclass_all highclass_all c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c12 c11 c0 c*_next

* move the per-caste counts out of the way (c* -> class*); the wildcard also
* catches caste, which is renamed straight back
rename c* class*
rename classaste caste

* c<k> / c<k>_next = sum over the three caste rows of the cell
* (missing when all three are missing)
forval i=1(1)12 {
    egen c`i'=total(class`i'), by(gender schcd year) missing
}

forval i=1(1)11 {
    egen c`i'_next=total(class`i'_next), by(gender schcd year) missing
}

* collapse to one row per school-year-gender
drop class*
duplicates drop schcd year gender c1 c2 c3 c4 c5 c6 c7 c8 c9 c10, force
keep c* gender schcd year lowclass_all highclass_all

* dropout share per class, same definition as in datause but on the
* lower-caste sums
forval i = 1(1)11 {
    local j = `i'+1
    bysort schcd gender (year): gen chg_enrolment`i'=(c`i'-c`i'_next)/c`i' if !missing(c`i'_next) & !missing(c`i') & lowclass_all<=`i' & highclass_all>=`j'
    bysort schcd gender (year): replace chg_enrolment`i'=0 if c`i'==c`i'_next & !missing(c`i'_next) & !missing(c`i') & lowclass_all<=`i' & highclass_all>=`j'
}

* prefix everything with "caste" so it can sit next to the TOTAL variables
keep gender schcd year chg_* c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12
rename chg_* castechg_*
forval i=1(1)12 {
    rename c`i' castec`i'
}
save "${user}\enrolment_caste.dta",replace

********************************************************************************
*                     generate data for other analyses                         *
********************************************************************************

/*-------------------------------- Table 3.2 ---------------------------------*/
* sample: girls in schools with a machine, boys in schools without one (TOTAL rows)
use "${user}\datause.dta", clear
keep if caste=="TOTAL" & ((gender==2 & t_machine_installed==1) | (gender==1 & t_machine_installed==0))
forvalues k=1/10 {
    bysort school_gender: egen num_chg_enrolment`k' = count(chg_enrolment`k')
    gen balancedpanel`k' = (num_chg_enrolment`k'==6)
}
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
gen aft=treatment_period>=0 & gender==2 & t_machine_installed==1  & !missing(treatment_period)
save "${user}\data_32.dta",replace

/*---------------------------- Table 3.3 - 3.5 -------------------------------*/
* long over grade: one row per original row and grade with a non-missing dropout value
use "${user}\datause.dta", clear
gen rid=_n
forvalues k=1/10 {
    rename chg_enrolment`k' chg_enrolment_grade`k'
}
reshape long chg_enrolment_grade, i(rid) j(grade)
drop if missing(chg_enrolment_grade)
egen school_gender_grade=group(schcd gender grade)
* sample: TOTAL rows of schools with a machine
keep if caste=="TOTAL" & t_machine_installed==1
egen num_chg_enrolment = count(chg_enrolment_grade), by(school_gender_grade)
gen balancedpanel = (num_chg_enrolment==6)
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
* aft is switched on only for girls in grade 7
gen aft=treatment_period>=0 & gender==2 & t_machine_installed==1 & grade==7  & !missing(treatment_period)
save "${user}\data_33.dta",replace

/*-------------------------------- Table 4.1 ---------------------------------*/
* Dropout measure adjusted for repeaters.
* (use takes the option clear, not replace)
use "${user}\datause.dta", clear
keep if caste=="TOTAL" | caste=="REPEATER"
keep caste gender schcd year lowclass_all highclass_all c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c12 c11 c0 fail*
* ffail<k> = largest fail<k> over the TOTAL and REPEATER rows of the same
* gender-school-year, so the TOTAL row carries it after REPEATER is dropped
forval i=0(1)12 {
    egen ffail`i'=max(fail`i'), by(gender schcd year)
}
keep if caste=="TOTAL"
* next-year values, used only when the following row is exactly one year later
forval i = 1/11 {
    local j = `i' + 1
    bysort schcd gender (year): gen c`i'_next = c`j'[_n+1] if year+1==year[_n+1]
    bysort schcd gender (year): gen fail`j'_next = ffail`j'[_n+1] if year+1==year[_n+1]
    bysort schcd gender (year): gen ffail`i'_next = ffail`i'[_n+1] if year+1==year[_n+1]
}
keep c* fail* ffail* gender schcd year lowclass_all highclass_all
* adjusted change: next year's class-(k+1) fail count is added back and next
* year's class-k fail count is subtracted before dividing by class-k enrolment
forval i = 1/10 {
    local j = `i' + 1
    bysort schcd gender (year): gen chg_enrolment`i'=(c`i'-c`i'_next+fail`j'_next-ffail`i'_next)/c`i' if !missing(c`i'_next) & !missing(c`i') & lowclass_all<=`i' & highclass_all>=`j'
    bysort schcd gender (year): replace chg_enrolment`i'=0 if c`i'-c`i'_next+fail`j'_next-ffail`i'_next==0 & !missing(c`i'_next) & !missing(c`i') & lowclass_all<=`i' & highclass_all>=`j'
}
keep gender schcd year chg_*
rename chg_* adjusted2chg_*
save "${user}\enrolment_adjusted2.dta",replace

/*-------------------------------- Table 4.3 ---------------------------------*/
* Per village: number of girls/mixed schools whose class range covers grade 8
* and grade 12 (one arbitrary row is kept per school).
use "${user}\datause.dta", clear
keep if caste=="TOTAL" & gender==2 & schtype>=2
duplicates drop schcd, force
keep schcd villagecode year schcat lowclass highclass t_machine_installed t_machine_year
foreach i in 8 12 {
    gen grade`i'=lowclass<=`i' & highclass>=`i'
}
collapse (sum) grade8 grade12, by(villagecode)
save "${user}\village_schoolcat2.dta", replace

/*--------------------------------- Table 6 ----------------------------------*/
* attach attendance (a5-a10) to the TOTAL enrolment rows; unmatched attendance rows are not kept
use "${user}\datause.dta", clear
merge m:1 schcd gender year using "${user}\tattendance.dta", keep(1 3) nogen
keep if caste=="TOTAL"
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
gen aft=treatment_period>=0 & gender==2 & t_machine_installed==1 & !missing(treatment_period)
gen bef = treatment_period<-1 & gender==2
* balanced = 7 non-missing attendance values per school-gender
forvalues k=5/10 {
    bysort school_gender: egen num_a`k' = count(a`k')
    gen balancedpanel`k' = (num_a`k'==7)
}
save "${user}\data_6.dta",replace

/*-------------------------------- Table 7.4 ---------------------------------*/
use "${user}\datause.dta", clear
merge m:1 gender schcd year using "${user}\enrolment_caste.dta", keepusing(castec*) nogen
keep if caste=="TOTAL"
*enrolment ratios
* 2013 lower-caste enrolment (classes 1-12) as a share of 2013 total enrolment
local castevars
forvalues k=1/12 {
    local castevars `castevars' castec`k'
}
egen total_caste_enrol2013=rowtotal(`castevars') if year==2013
egen total_caste_enrolbg2013=total(total_caste_enrol2013) if year==2013, by(schcd year)
egen total_caste_enrolbg_2013=max(total_caste_enrolbg2013), by(schcd)
replace total_caste_enrolbg_2013=total_caste_enrolbg_2013/total_enrolbg_2013
egen total_caste_enrolg_2013=max(total_caste_enrol2013), by(schcd)
replace total_caste_enrolg_2013=total_caste_enrolg_2013/total_enrol_2013
keep if gender==2 & inlist(schtype,2,3)
forvalues k=1/10 {
    bysort school_gender: egen num_chg_enrolment`k' = count(chg_enrolment`k')
    gen balancedpanel`k' = (num_chg_enrolment`k'==6)
}
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
gen aft=treatment_period>=0 & gender==2 & t_machine_installed==1 & !missing(treatment_period)
* NOTE(review): the threshold is applied to total_caste_enrolbg_2013 while the
* missing() guard tests total_caste_enrolg_2013 - confirm this is intended.
gen lowcaste=total_caste_enrolbg_2013>=.6869307 & !missing(total_caste_enrolg_2013)
save "${user}\data_74.dta",replace

/*-------------------------------- Table A13 ---------------------------------*/
use "${user}\datause.dta", clear
keep if caste=="TOTAL" & inlist(schtype,2,3)
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
forvalues k=1/10 {
    * blank the dropout measure for boys' rows in schtype==2 schools
    replace chg_enrolment`k'=. if chg_enrolment`k'!=. & gender==1 & schtype==2
    bysort school_gender: egen num_chg_enrolment`k' = count(chg_enrolment`k')
    gen balancedpanel`k' = (num_chg_enrolment`k'==6)
}
* aft is not restricted to girls in this table
gen aft=treatment_period>=0 & t_machine_installed==1 & !missing(treatment_period)
save "${user}\data_a13.dta", replace

********************************************************************************
*                            Match PCA to data                                 *
********************************************************************************

import delimited "${user}\PCA_2011\pca_amenities.csv", clear

* the code columns carry apostrophes in the raw file: strip them, then convert
foreach codevar in state_code district_code sub_district_code village_code cd_block_code {
    replace `codevar' = subinstr(`codevar', "'", "", .)
    destring `codevar', replace
}

* for every string column, blank out the literal "NA" and try a numeric conversion;
* non-string columns are left untouched
foreach var of varlist _all {
    capture confirm string variable `var'
    if _rc == 0 {
        replace `var' = "" if `var' == "NA"
        destring `var', replace
    }
}

* clean variables: trim and upper-case the names used for matching
foreach namevar in village_name cd_block_name {
    replace `namevar' = upper(strtrim(trim(`namevar')))
}
* keep one row per (village, block) name pair
duplicates drop village_name cd_block_name, force

* generate id for match
rename v1 id2
egen blockid2 = group(cd_block_name)
egen villageid2 = group(village_name)
egen villagepincodeid = group(village_name pin_code), missing
rename cd_block_name blockname2
rename village_name villagename2

* save using data
save "${user}\pca_amenities.dta", replace

* keep block and village names and ids for match
keep blockname2 villageid2 villagename2 blockid2 pin_code villagepincodeid
save "${user}\pca_amenities_prematch.dta", replace

* second lookup: one row per village-name/pincode id
duplicates drop villagepincodeid, force
save "${user}\pca_amenities_prematch2.dta", replace

* clean and save master data for match:
* distinct, trimmed (village, block) name pairs from datause with group ids
use village_name block_name using "${user}\datause.dta", clear
foreach namevar in village_name block_name {
    replace `namevar' = trim(strtrim(`namevar'))
}
keep if !missing(village_name)
duplicates drop

egen villageid1 = group(village_name)
egen blockid1 = group(block_name)
rename village_name villagename1
rename block_name blockname1

save "${user}\merge_temp.dta", replace

* 1st round of matchit using block name
* (runs on the master data currently in memory; bigram similarity)
matchit blockid1 blockname1 using pca_amenities_prematch.dta, ///
    sim(ngram,2) idusing(blockid2) txtusing(blockname2) ///
    generate(similarity_score) override
* manual step: check matched pairs, sort out duplicated matched pairs,
* then save the result as blockmerge
* save "${user}\blockmerge.dta"

* 2nd round of matchit using village name
use merge_temp, clear
matchit villageid1 villagename1 using pca_amenities_prematch.dta, ///
    sim(ngram,2) idusing(villageid2) txtusing(villagename2) ///
    generate(similarity_score) override
* manual step: check matched pairs, sort out duplicated matched pairs,
* then save the result as villagemerge
* save "${user}\villagemerge.dta"

* merge matchit results (based on block and village names) to master files
use merge_temp, clear
merge m:1 blockname1 using "${user}\blockmerge.dta", keepusing(blockname2)
rename _merge _merge_block
merge m:1 villagename1 using "${user}\villagemerge.dta", keepusing(villagename2)
rename _merge _merge_village
merge m:1 blockname2 villagename2 using "${user}\pca_amenities.dta", keepusing(blockid2 villageid2)

* From here on _merge must be written out in full: _merge_block, _merge_village
* and _merge all exist, so the abbreviation _m is ambiguous and would error.

* save matched results as pca_village_match1
preserve
keep if _merge==3
keep blockname1 villagename1 blockname2 villagename2
* attach the PCA village / block codes for the matched name pairs
merge m:1 villagename2 blockname2 using "${user}\pca_amenities.dta", keepusing(village_code cd_block_code) keep(3) nogen
rename blockname1 block_name
rename villagename1 village_name
* written under both spellings; the estimation step reads pca_village_match_1
save pca_village_match_1,replace
save "${user}\pca_village_match1.dta", replace
restore

* save unmatched ones as pca_village_unmatch1 (later run 3rd round of match using pincode)
keep if _merge==1
keep blockname1 villagename1
gen unmatch=1
save "${user}\pca_village_unmatch1.dta", replace

* clean masterfile for 3rd round of match:
* distinct, trimmed (village, block, pincode) combinations from datause
use village_name block_name pincode using "${user}\datause.dta", clear
foreach namevar in village_name block_name {
    replace `namevar' = trim(strtrim(`namevar'))
}
keep if !missing(village_name)
duplicates drop

* identify unmatched ones: keep only pairs left over from the name-based rounds
rename village_name villagename1
rename block_name blockname1
merge m:1 villagename1 blockname1 using "${user}\pca_village_unmatch1.dta", ///
    keepusing(unmatch) keep(3) nogenerate

* generate id for match: one row per (village name, pincode), pincode required
duplicates drop villagename1 pincode, force
drop if missing(pincode)
egen id = group(villagename1 pincode)
rename villagename1 villagename
rename pincode pin_code

* 3rd round of match using exact pincode (pin_code must agree; village name is fuzzy)
reclink villagename pin_code using pca_amenities_prematch2.dta, ///
    gen(similarityscore) idmaster(id) idusing(villagepincodeid) ///
    wmatch(10 2) required(pin_code)

* save matched results as pca_village_match2

* step 1: temporary PCA lookup with one row per (villagename2, pin_code)
preserve
    use "${user}\pca_amenities.dta", clear
    duplicates drop villagename2 pin_code, force
    tempfile pca_amenities_2
    save `pca_amenities_2', replace
restore

* step 2: keep the reclink matches and attach PCA village / block codes
preserve
    keep if _merge == 3
    keep villagename Uvillagename pin_code blockname1
    rename villagename villagename1
    rename Uvillagename villagename2
    merge m:1 villagename2 pin_code using `pca_amenities_2', ///
        keepusing(village_code cd_block_code) keep(3) nogenerate
    save "${user}\pca_village_match_2.dta", replace
restore

* for unmatched ones, run 4th round of fuzzy match using pincode
* (pin_code is no longer required to agree; it only contributes to the score)
keep if _merge==1
keep blockname1 villagename pin_code unmatch id
reclink villagename pin_code using pca_amenities_prematch2.dta, gen(similarityscore) idmaster(id) idusing(villagepincodeid) wmatch(10 5)

* save matched results as pca_village_match3
keep if _merge==3
keep villagename Uvillagename pin_code Upin_code blockname1
rename villagename villagename1
rename Uvillagename villagename2
* keep the master pincode as pincode1; the PCA-side pincode becomes the merge key
rename pin_code pincode1
rename Upin_code pin_code
* Look up codes in the tempfile `pca_amenities_2' created for the exact-pincode
* round (same lookup as for pca_village_match_2). All options follow one comma.
merge m:1 villagename2 pin_code using `pca_amenities_2', keepusing(village_code cd_block_code) keep(3) nogen
save "${user}\pca_village_match3.dta", replace

* append pca_village_match2 and pca_village_match3
use "${user}\pca_village_match_2.dta", clear
* exact-pincode matches: the master pincode equals the PCA pincode
gen pincode1=pin_code
* the 4th-round results are saved as pca_village_match3.dta (no underscore before 3)
append using "${user}\pca_village_match3.dta"
sort blockname1 villagename1 pincode1
duplicates list blockname1 villagename1 pincode1
* NOTE(review): set seed comes after the sort and only precedes duplicates drop;
* confirm it actually makes the choice among duplicated keys reproducible.
set seed 1234
duplicates drop  blockname1 villagename1 pincode1, force
rename blockname1 block_name
rename villagename1 village_name
rename pincode1 pincode
keep block_name village_name pincode village_code cd_block_code
save "${user}\pca_village_match_2clean.dta",replace

* prepare data for estimation
use "${user}\datause.dta", clear

* obtain matched codes
* round 1/2 (name-based) codes
merge m:1 block_name village_name using "${user}\pca_village_match_1.dta", keepusing(village_code cd_block_code) nogen
rename village_code village_code1
rename cd_block_code cd_block_code1

* round 3/4 (pincode-based) codes
merge m:1 block_name village_name pincode using "${user}\pca_village_match_2clean.dta", keepusing(village_code cd_block_code) keep(1 3) nogen
rename village_code village_code2
rename cd_block_code cd_block_code2

* obtain amenities info from pca_amenities
* name-based codes take priority; pincode-based codes fill the gaps
gen village_code=village_code1
replace village_code=village_code2 if missing(village_code1) & !missing(village_code2)
gen cd_block_code=cd_block_code1
replace cd_block_code=cd_block_code2 if missing(cd_block_code1) & !missing(cd_block_code2)

* PCA variables to bring in. The list is continued with /// so the command is
* not cut by a line break, and names that were listed twice appear once.
local pca_vars ///
    x_is_the_area_covered_under_tota community_waste_disposal_system_ ///
    post_office__status_a_1__na_2__ telephone__landlines___status_a_ ///
    mobile_phone_coverage__status_a_ internet_cafes___common_service_ ///
    all_weather_road__status_a_1__na public_distribution_system__pds_ ///
    sports_club_recreation_centre__s cinema_video_hall__status_a_1__n ///
    public_library__status_a_1__na_2 power_supply_for_domestic_use__s ///
    power_supply_for_commercial_use_ tap_water_treated_functioning_al ///
    sub_district_head_quarter__dista district_head_quarter__distance_ ///
    nearest_statutory_town__distance within_the_state_ut__distance_in ///
    outside_the_state_ut_distance__i total_geographical_area__in_hect ///
    total__households_ total_population_of_village ///
    total_male_population_of_village total_female_population_of_villa ///
    total_scheduled_castes_populatio total_scheduled_castes_male_popu ///
    total_scheduled_castes_female_po total_scheduled_tribes_populatio ///
    total_scheduled_tribes_male_popu total_scheduled_tribes_female_po ///
    community_health_centre__numbers primary_health_centre__numbers_ ///
    primary_health_sub_centre__numbe maternity_and_child_welfare_cent ///
    family_welfare_centre__numbers_ non_government_medical_facilitie ///
    v207 v208 ///
    community_toilet_complex__includ community_toilet_complex__exclud ///
    rural_production_centres_or_sani rural_production_mart_or_sanitar ///
    public_call_office__mobile__pco_ private_courier_facility__status ///
    public_bus_service__status_a_1__ private_bus_service__status_a_1_ ///
    railway_station__status_a_1__na_ auto_modified_autos__status_a_1_ ///
    taxi__status_a_1__na_2__ vans__status_a_1__na_2__ ///
    tractors__status_a_1__na_2__ national_highway__status_a_1__na ///
    state_highway__status_a_1__na_2_ major_district_road__status_a_1_ ///
    other_district_road__status_a_1_ black_topped__pucca__road__statu ///
    gravel__kuchha__roads__status_a_ nutritional_centres_icds__status ///
    nutritional_centres_anganwadi_ce mandis_regular_market__status_a_ ///
    asha__status_a_1__na_2__ community_centre_with_without_tv ///
    nearest_town_distance_from_villa

* keep only observations whose village could be linked to a PCA record
merge m:1 village_code cd_block_code using "${user}\pca_amenities.dta", keepusing(`pca_vars') keep(3) nogen

* keep girls only (girls school and mixed school)
keep if gender==2 & (schtype==2 | schtype==3) & caste=="TOTAL"

* set balanced panel from 2013-2018
forval i=1/10{
bysort school_gender: egen num_chg_enrolment`i' = count(chg_enrolment`i')
gen balancedpanel`i' = num_chg_enrolment`i' == 6
}

* gen variables
* 0/1 amenity indicators. A missing PCA value (raw "NA") yields 0 in every
* indicator: the ==1 tests are false for missing, and the >=1 tests are guarded
* with !missing() because Stata ranks missing above every number.
gen publicbus=public_bus_service__status_a_1__==1
gen highway=national_highway__status_a_1__na==1
gen chc=community_health_centre__numbers==1
gen phc=(primary_health_centre__numbers_>=1 & !missing(primary_health_centre__numbers_)) | ///
    (primary_health_sub_centre__numbe>=1 & !missing(primary_health_sub_centre__numbe))
gen fwc=family_welfare_centre__numbers_>=1 & !missing(family_welfare_centre__numbers_)
gen mcw=maternity_and_child_welfare_cent>=1 & !missing(maternity_and_child_welfare_cent)
gen rpm=rural_production_mart_or_sanitar==1
gen internet=internet_cafes___common_service_==1
gen courier=private_courier_facility__status==1

* gen treatment variables
* event time relative to the machine year, and the post-installation indicator
gen treatment_period=year-t_machine_year if !missing(t_machine_year)
gen aft=treatment_period>=0 & gender==2 & t_machine_installed==1 & !missing(treatment_period)

save "${user}\datapc.dta", replace





















